Data Visualization with Pandas and Matplotlib

# import library 
import pandas as pd 
import matplotlib.pyplot as plt 

# display plot in the notebook 
%matplotlib inline 

# default figure size and base font size for every plot below
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 14})

# load the drinks data, replacing the file's header row with shorter column names;
# na_filter=False keeps the continent code "NA" as a plain string instead of NaN
drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
drinks = pd.read_csv('../data/drinks.csv', header=0, names=drink_cols, na_filter=False)

Data Exploration

# examine first few rows 
drinks.head() 
country beer spirit wine liters continent
0 Afghanistan 0 0 0 0.0 AS
1 Albania 89 132 54 4.9 EU
2 Algeria 25 0 14 0.7 AF
3 Andorra 245 138 312 12.4 EU
4 Angola 217 57 45 5.9 AF
# observations and columns 
drinks.shape
(193, 6)
# data structure 
drinks.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    193 non-null    object 
 1   beer       193 non-null    int64  
 2   spirit     193 non-null    int64  
 3   wine       193 non-null    int64  
 4   liters     193 non-null    float64
 5   continent  193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
# numerical summary 
drinks.describe() 
beer spirit wine liters
count 193.000000 193.000000 193.000000 193.000000
mean 106.160622 80.994819 49.450777 4.717098
std 101.143103 88.284312 79.697598 3.773298
min 0.000000 0.000000 0.000000 0.000000
25% 20.000000 4.000000 1.000000 1.300000
50% 76.000000 56.000000 8.000000 4.200000
75% 188.000000 128.000000 59.000000 7.200000
max 376.000000 438.000000 370.000000 14.400000

Histogram: show the distribution of a numerical variable

# sort the beer column and split it into 3 groups
drinks.beer.sort_values().values
array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   1,   1,   1,   1,   2,   3,   5,   5,   5,   5,   5,
         6,   6,   6,   6,   8,   8,   8,   9,   9,   9,   9,  12,  13,
        15,  15,  16,  16,  17,  18,  19,  19,  20,  20,  21,  21,  21,
        21,  22,  23,  25,  25,  25,  25,  26,  28,  31,  31,  31,  31,
        32,  32,  34,  36,  36,  36,  37,  42,  42,  43,  44,  45,  47,
        49,  51,  51,  52,  52,  52,  53,  56,  56,  57,  58,  60,  62,
        62,  63,  64,  69,  71,  76,  76,  77,  77,  77,  78,  79,  82,
        82,  85,  88,  89,  90,  92,  93,  93,  98,  99, 102, 105, 106,
       109, 111, 115, 120, 122, 124, 127, 128, 130, 133, 140, 142, 143,
       144, 147, 149, 149, 152, 157, 159, 162, 163, 167, 169, 171, 173,
       185, 188, 192, 193, 193, 194, 194, 196, 197, 199, 203, 206, 213,
       217, 219, 224, 224, 225, 230, 231, 233, 234, 236, 238, 240, 245,
       245, 247, 249, 251, 261, 263, 263, 270, 279, 281, 283, 284, 285,
       295, 297, 306, 313, 333, 343, 343, 346, 347, 361, 376])
# compare with histogram 
drinks.beer.plot(kind="hist", bins=3);
../_images/MPL02-Data Visualization with Pandas and Matplotlib_10_0.png
# try more bins 
drinks.beer.plot(kind="hist", bins=20); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_11_0.png
# add a title and axis labels
drinks.beer.plot(kind="hist", bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")  # same spelling as the title
plt.ylabel("Frequency")
# show plot
plt.show()
../_images/MPL02-Data Visualization with Pandas and Matplotlib_12_0.png
# compare with density plot(smooth version of a histogram) 
drinks.beer.plot(kind="density", xlim=(0, 500));
../_images/MPL02-Data Visualization with Pandas and Matplotlib_13_0.png

Scatter Plot: show the relationship between two numerical variables

# select the beer and wine columns and sort by beer 
drinks[["beer", "wine"]].sort_values(by="beer").values
array([[  0,   0],
       [  0,  74],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  1,   7],
       [  1,   1],
       [  1,   4],
       [  1,   1],
       [  2,   0],
       [  3,   1],
       [  5,   0],
       [  5,   0],
       [  5,  16],
       [  5,   1],
       [  5,   0],
       [  6,   1],
       [  6,   0],
       [  6,   1],
       [  6,   9],
       [  8,   0],
       [  8,   1],
       [  8,   1],
       [  9,   2],
       [  9,   0],
       [  9,   7],
       [  9,   0],
       [ 12,  10],
       [ 13,   0],
       [ 15,   3],
       [ 15,   1],
       [ 16,   5],
       [ 16,   0],
       [ 17,   1],
       [ 18,   0],
       [ 19,  32],
       [ 19,   2],
       [ 20,   0],
       [ 20,  31],
       [ 21,  11],
       [ 21,  11],
       [ 21,   5],
       [ 21,   1],
       [ 22,   1],
       [ 23,   0],
       [ 25,   8],
       [ 25,  14],
       [ 25,   2],
       [ 25,   7],
       [ 26,   4],
       [ 28,  21],
       [ 31, 128],
       [ 31,   6],
       [ 31,  10],
       [ 31,   1],
       [ 32,   4],
       [ 32,   1],
       [ 34,  13],
       [ 36,  19],
       [ 36,   5],
       [ 36,   1],
       [ 37,   7],
       [ 42,   2],
       [ 42,   7],
       [ 43,   0],
       [ 44,   1],
       [ 45,   0],
       [ 47,   5],
       [ 49,   8],
       [ 51,  20],
       [ 51,   7],
       [ 52,   2],
       [ 52, 149],
       [ 52,  26],
       [ 53,   2],
       [ 56, 140],
       [ 56,   1],
       [ 57,   1],
       [ 58,   2],
       [ 60,  11],
       [ 62,  18],
       [ 62, 123],
       [ 63,   9],
       [ 64,   4],
       [ 69,   2],
       [ 71,   1],
       [ 76,   8],
       [ 76,   9],
       [ 77,   8],
       [ 77,  16],
       [ 77,   1],
       [ 78,   1],
       [ 79,   8],
       [ 82,   9],
       [ 82,   0],
       [ 85, 237],
       [ 88,   0],
       [ 89,  54],
       [ 90,   2],
       [ 92, 233],
       [ 93,   5],
       [ 93,   1],
       [ 98,  18],
       [ 99,   1],
       [102,  45],
       [105,  24],
       [106,  86],
       [109,  18],
       [111,   1],
       [115, 220],
       [120,  11],
       [122,  51],
       [124,  12],
       [127, 370],
       [128,   7],
       [130, 172],
       [133, 218],
       [140,   9],
       [142,  42],
       [143,  36],
       [144,  16],
       [147,   4],
       [149, 120],
       [149,  11],
       [152, 186],
       [157,  51],
       [159,   3],
       [162,   3],
       [163,  21],
       [167,   8],
       [169, 129],
       [171,  71],
       [173,  35],
       [185, 280],
       [188,   7],
       [192, 113],
       [193,   9],
       [193, 221],
       [194, 339],
       [194,  32],
       [196, 116],
       [197,   7],
       [199,  28],
       [203, 175],
       [206,  45],
       [213,  74],
       [217,  45],
       [219, 195],
       [224,  59],
       [224, 278],
       [225,  81],
       [230, 254],
       [231,  94],
       [233,  78],
       [234, 185],
       [236, 271],
       [238,   5],
       [240, 100],
       [245, 312],
       [245,  16],
       [247,  73],
       [249,  84],
       [251, 190],
       [261, 212],
       [263,  97],
       [263,   8],
       [270, 276],
       [279, 191],
       [281,  62],
       [283, 127],
       [284, 112],
       [285,  18],
       [295, 212],
       [297, 167],
       [306,  23],
       [313, 165],
       [333,   3],
       [343,  56],
       [343,  56],
       [346, 175],
       [347,  59],
       [361, 134],
       [376,   1]])
# compare with scatter plot
drinks.plot(kind="scatter", x="beer", y="wine"); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_16_0.png
# add transparency 
drinks.plot(kind='scatter', x="beer", y="wine", alpha=0.3); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_17_0.png
# vary point color by spirit servings 
drinks.plot(kind="scatter", x="beer", y="wine", c="spirit", colormap="Blues"); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_18_0.png
# scatter matrix of 3 numerical columns 
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']]); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_19_0.png
# increase figure size 
# scatter matrix of 3 numerical columns 
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']], figsize=(10,8)); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_20_0.png

Bar Plot: show a numerical comparison across different categories

# count the number of countries in each continent 
drinks.continent.value_counts()
AF    53
EU    45
AS    44
NA    23
OC    16
SA    12
Name: continent, dtype: int64
# compare with bar plot 
drinks.continent.value_counts().plot(kind="bar"); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_23_0.png
# calculate the mean alcohol amounts for each continent
# numeric_only=True skips the text column (country); pandas >= 2.0 raises a TypeError without it
drinks.groupby('continent').mean(numeric_only=True)
beer spirit wine liters
continent
AF 61.471698 16.339623 16.264151 3.007547
AS 37.045455 60.840909 9.068182 2.170455
EU 193.777778 132.555556 142.222222 8.617778
NA 145.434783 165.739130 24.521739 5.995652
OC 89.687500 58.437500 35.625000 3.381250
SA 175.083333 114.750000 62.416667 6.308333
# side-by-side bar plots of the per-continent means
# numeric_only=True skips the text column (country); pandas >= 2.0 raises a TypeError without it
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar');
../_images/MPL02-Data Visualization with Pandas and Matplotlib_25_0.png
# drop the liters column so only the three serving counts are plotted
# numeric_only=True skips the text column (country); pandas >= 2.0 raises a TypeError without it
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar');
../_images/MPL02-Data Visualization with Pandas and Matplotlib_26_0.png
# stacked bar plots of the per-continent means (liters dropped)
# numeric_only=True skips the text column (country); pandas >= 2.0 raises a TypeError without it
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar', stacked=True);
../_images/MPL02-Data Visualization with Pandas and Matplotlib_27_0.png

Box Plot: show quartiles (and outliers) for one or more numerical variables

Five-Number Summary

  • min = minimum value

  • 25% = first quartile (Q1) = median of the lower half of the data

  • 50% = second quartile (Q2) = median of the data

  • 75% = third quartile (Q3) = median of the upper half of the data

  • max = maximum value (the five-number summary is more useful than the mean and standard deviation for describing skewed distributions)

  • Interquartile Range (IQR) = Q3 - Q1

Outliers

  • below Q1 - 1.5 * IQR

  • above Q3 + 1.5 * IQR

# sort the spirit column 
drinks.spirit.sort_values().values 
array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,
         3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   5,   5,
         6,   6,   6,   7,   9,  11,  11,  12,  13,  15,  15,  16,  16,
        18,  18,  18,  18,  19,  21,  21,  22,  22,  25,  25,  27,  29,
        31,  31,  34,  35,  35,  35,  35,  38,  39,  41,  41,  42,  42,
        44,  46,  50,  51,  55,  56,  57,  60,  61,  63,  63,  65,  67,
        68,  69,  69,  69,  71,  71,  72,  74,  75,  76,  76,  79,  81,
        84,  87,  87,  88,  97,  97,  98,  98, 100, 100, 100, 100, 101,
       104, 104, 112, 114, 114, 114, 117, 117, 118, 118, 122, 122, 124,
       126, 128, 131, 132, 133, 133, 135, 137, 138, 145, 147, 151, 152,
       154, 156, 157, 158, 160, 170, 173, 173, 176, 178, 179, 186, 189,
       192, 194, 200, 202, 205, 215, 215, 216, 221, 226, 237, 244, 246,
       252, 254, 258, 286, 293, 302, 315, 326, 326, 373, 438])
# show five-number summary of spirit 
drinks.spirit.describe() 
count    193.000000
mean      80.994819
std       88.284312
min        0.000000
25%        4.000000
50%       56.000000
75%      128.000000
max      438.000000
Name: spirit, dtype: float64
# compare with boxplot 
drinks.spirit.plot(kind='box');  
../_images/MPL02-Data Visualization with Pandas and Matplotlib_31_0.png
# include multiple variables 
drinks.drop('liters', axis=1).plot(kind='box'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_32_0.png

Line Plot: show the trend of a numerical variable over time

# load the UFO reports, parse the Time column into datetimes, then derive the report year
ufo = pd.read_csv("../data/ufo.csv")
ufo["Time"] = pd.to_datetime(ufo["Time"])
ufo["Year"] = ufo["Time"].dt.year
# examine first few rows  
ufo.head() 
City Colors Reported Shape Reported State Time Year
0 Ithaca NaN TRIANGLE NY 1930-06-01 22:00:00 1930
1 Willingboro NaN OTHER NJ 1930-06-30 20:00:00 1930
2 Holyoke NaN OVAL CO 1931-02-15 14:00:00 1931
3 Abilene NaN DISK KS 1931-06-01 13:00:00 1931
4 New York Worlds Fair NaN LIGHT NY 1933-04-18 19:00:00 1933
# observations and columns 
ufo.shape 
(80543, 6)
# data structure 
ufo.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80543 entries, 0 to 80542
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   City             80496 non-null  object        
 1   Colors Reported  17034 non-null  object        
 2   Shape Reported   72141 non-null  object        
 3   State            80543 non-null  object        
 4   Time             80543 non-null  datetime64[ns]
 5   Year             80543 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 3.7+ MB
# numerical summary 
ufo.describe()  
Year
count 80543.000000
mean 2004.178737
std 10.602487
min 1930.000000
25% 2001.000000
50% 2007.000000
75% 2011.000000
max 2014.000000
# count the number of ufo reports each year (and sort by year)
ufo.Year.value_counts().sort_index() 
1930       2
1931       2
1933       1
1934       1
1935       1
        ... 
2010    4154
2011    5089
2012    7263
2013    7003
2014    5382
Name: Year, Length: 82, dtype: int64
# compare with line plot 
ufo.Year.value_counts().sort_index().plot();
../_images/MPL02-Data Visualization with Pandas and Matplotlib_40_0.png
# don't use a line plot when there is no logical ordering 
drinks.continent.value_counts().plot(kind='line'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_41_0.png

Grouped Box Plots: show one box plot for each group

# reminder: boxplot of beer servings
drinks.beer.plot(kind='box'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_43_0.png
# boxplot of beer servings grouped by continent
drinks.boxplot(column='beer', by='continent'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_44_0.png
# boxplot of all numerical columns grouped by continent
drinks.boxplot(by='continent'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_45_0.png

Grouped Histograms: show one histogram for each group

# reminder: histogram of beer servings
drinks.beer.plot(kind='hist', bins=20); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_47_0.png
# histogram of beer servings grouped by continent
drinks.hist(column='beer', by='continent'); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_48_0.png
# share the x-axis 
drinks.hist(column='beer', by='continent', sharex=True); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_49_0.png
# share the x and y axis 
drinks.hist(column='beer', by='continent', sharex=True, sharey=True); 
../_images/MPL02-Data Visualization with Pandas and Matplotlib_50_0.png
# change the layout 
drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3));
../_images/MPL02-Data Visualization with Pandas and Matplotlib_51_0.png

Assorted Functionality

# saving a plot to a file
drinks.beer.plot(kind='hist', bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
# other extensions can be used as well: .png, .tiff, .pdf, .jpeg
# NOTE(review): the file name keeps its original spelling so existing references to it still resolve
plt.savefig("beer_survings.png")
../_images/MPL02-Data Visualization with Pandas and Matplotlib_53_0.png
# list available plot style 
plt.style.available
['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'tableau-colorblind10']
# use plot style: ggplot 
plt.style.use('ggplot')
# histogram of beer servings in ggplot style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_56_1.png
# use plot style: seaborn
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old names, so use whichever name this install provides
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# histogram of beer servings in seaborn style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_58_1.png
# use plot style: fivethirtyeight
plt.style.use('fivethirtyeight') 
# histogram of beer servings in fivethirtyeight style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequnecy')
../_images/MPL02-Data Visualization with Pandas and Matplotlib_60_1.png